/* smog_collapse.do: This script takes the prepared inspection-level panel of Smog Checks,
combines it with station quality scores, and collapses to the county-day level.  

Note that the inputs to this program were not included in the data archive, but the output,
star_counties.do, is included*/

clear all

cd "${path}build"

capture log close
log using smog_collapse.log, replace
set more off

/*We want to connect a daily series of station-level numbers of inspections to the (quarterly)
STAR-related station characteristics*/


use star_scores 


gen svfr_bin = min(floor(svfr_ratio*4),10)
gen fpr_bin = floor(fpr_current*20)


/*Create a "simulated" current FPR, what would occur if all stations had to be at least 0.4*/

gen fpr_sim_abb = max(fpr_current,0.4) if fpr_current!=.

/*Drop stations with SVFR 0 -- who knows what it means for a station to never
fail a single vehicle all quarter*/

drop if svfr_ratio == 0 | fpr_current == 1

tempfile temp
save `temp'

use smog_precollapse

/*For each day and county, we want the number of re-passes which occured, and we want to know how many re-passes
occured in ASM checks, on old cars, and the number off each of those which occurred at stations meeting each
of the STAR qualifiers*/

sort vin date teststart

/* Also want types of re-pass by age and the way they failed*/
by vin: gen cycle = sum(initial_inspection)

sort vin cycle date teststart

by vin cycle: gen re_pass = fail == 0 & sum(fail)>0  & _n == _N
//gen re_pass = fail == 0 & initial_inspection ==0 & ((initial_inspection[_n+1]==1 & vin[_n+1]==vin)|vin[_n+1]!=vin)

gen pass = fail==0 & initial_inspection ==1

/*Generate a couple of new variables so that we can describe the profile of vehicles being inspected at each station*/

gen asm = test_type == "A" if re_pass //Only non-missing for re-passes so we only count one per cycle

gen directed = inlist(insp_reasn,"D","S","P") 

/* Indicators for vehicle age at re-passes.  We will sum these up to get the total number of inspections
by type*/

gen firstgen = modelyr<1985 if re_pass
gen secondgen = modelyr >=1985 & modelyr<1996 if re_pass
gen thirdgen = modelyr >=1996 if re_pass
gen othergen = modelyr>=1985  if re_pass

/* Number of passed initial inspections by age*/

gen oldpass = modelyr < 1985 if pass
gen newpass = modelyr >=1985 if pass

/* Number of all initial inspections by age*/

gen old_init = modelyr<1985 if initial_inspection
gen new_init = modelyr>=1985 if initial_inspection


/* Also want types of re-pass by age and the way they failed*/


by vin cycle: gen re_pass_nonemis = sum(initial_inspection*(overall=="F" & ovral_emis=="P"))
by vin cycle: gen re_pass_emis = sum(initial_inspection*(ovral_emis =="F"))
by vin cycle: gen re_pass_gross = sum(initial_inspection*(ovral_emis == "G"))



foreach fail in nonemis emis gross{
	qui replace re_pass_`fail' = 0 if re_pass == 0
	gen firstgen_`fail' = modelyr<1985 if re_pass_`fail'
	gen othergen_`fail' = modelyr>=1985 if re_pass_`fail'
}

//gen tenplus = year-modelyr>=10 if re_pass

/* Diagnostic: should only be one re-pass per cycle
by vin: gen cycle = sum(initial_inspection)
bysort vin cycle: egen re_pass_total = total(re_pass)
tab re_pass_total
*/

gen fail_initial = initial_inspection & fail
gen fail_old = modelyr < 1985 if fail_initial
gen fail_new = modelyr >=1985 if fail_initial

/*Collapse to one observation per station per day, then link station scores*/
gcollapse quarter (count) N = year (sum) fail_* initial pass oldpass newpass old_init new_init re_pass* directed /*tenplus asm*/ *gen*, by(station_id date) fast unsorted


joinby station_id quarter using `temp'


/*
forvalues i= 0/10{
	foreach var in firstgen othergen re_pass {
		gen `var'_svfrbin`i' = `var'*(svfr_bin==`i')
	}
}
*/
forvalues i= 0/19{
	foreach var in firstgen  othergen {

		gen `var'_fprbin`i' = `var'*(fpr_bin==`i')
	}
}



drop fpr_bin svfr_bin

save `temp', replace

/*Collapse to get totals by day and county, */

gcollapse (sum)  directed /*asm tenplus*/ *gen* N initial old_init new_init oldpass newpass pass re_pass* fail_*, by(date county) fast unsorted



tempfile totals
save `totals'

/*Now go back and get average FPR/SVFR weighted by all re_passes*/
use `temp'
gcollapse fpr_current svfr_ratio  [fw=re_pass], by(date county) fast unsorted

tempfile re_pass
save `re_pass'



/*Now go back and get the FPR/SVFR weighted by older and newer re_passes*/

use `temp' 

gcollapse fpr_current_firstgen = fpr_current svfr_ratio_firstgen = svfr_ratio fpr_sim_abb  [fw=firstgen], by(date county) fast unsorted

tempfile firstgen
save `firstgen'

use `temp'

gcollapse fpr_current_othergen = fpr_current svfr_ratio_othergen = svfr_ratio  [fw=othergen], by(date county) fast unsorted

tempfile othergen

save `othergen'

/*Now go back and calculate current FPR, re-assigning firstgen inspections at current FPR stations to other stations*/

use `temp'

gen bad_station = (fpr_current<0.4)

/*Total re-inspections to redistribute*/
bysort county date: egen total_bad = total(firstgen*(bad_station))

/*We will re-distribute according to market share of good stations in the current quarter*/
bysort county quarter: egen total_good = total(firstgen*(bad_station == 0))
bysort station quarter: egen this_station_good = total(firstgen)

gen share_good = this_station_good/total_good

/*Calculate a weight equal to zero if you are below the cutoffs, and your actual firstgen plus your share of the bad inspections if you're above*/
gen fpr_sim_weight = (fpr_current>=0.4)*(firstgen + round(total_bad*share_good))

/*For some county-days, the good stations that report inspections have a low enough market share that the counts round down, leaving some 
re-inspections unassigned.  Manually add these, starting with the highest market share stations*/

/*First calculate the total number of re-inspections reassigned each county-day*/
bysort county date: egen extra_good = total((fpr_sim_weight-firstgen)*(bad_station==0))

/*And the number of good stations they are being assigned to*/
by county date: egen good_stations = total((bad_station == 0))

/*This gives the extra to be assigned*/
gen unassigned = total_bad - extra_good

/*If there are more extra than good stations (it happens), start by adding 1 to all good stations.  Do this several times, since there's
a small handful that have many more bad stations than good*/

forvalues i = 1/5{
	replace fpr_sim_weight = fpr_sim_weight + 1 if (bad_station == 0) & unassigned > good_stations
	replace unassigned = unassigned - good_stations if unassigned > good_stations
}

/*Now sort by market share and apply the remainder*/

gsort county date bad_station - share_good

by county date bad_station: replace fpr_sim_weight = fpr_sim_weight + 1 if bad_station == 0 & _n<= unassigned

gcollapse fpr_sim_firstgen = fpr_current  [fw=fpr_sim_weight], by(date county) fast unsorted

tempfile fpr_sim
save `fpr_sim'

/*Now go back and calculate current FPR, re-assigning firstgen inspections at low standard FPR and SVFR stations to other stations*/

use `temp'

gen bad_station = (fpr_score<0.4 | svfr_ratio<.75)

/*Total re-inspections to redistribute*/
bysort county date: egen total_bad = total(firstgen*(bad_station))

/*We will re-distribute according to market share of good stations in the current quarter*/
bysort county quarter: egen total_good = total(firstgen*(bad_station == 0))
bysort station quarter: egen this_station_good = total(firstgen)

gen share_good = this_station_good/total_good

/*Calculate a weight equal to zero if you are below the cutoffs, and your actual firstgen plus your share of the bad inspections if you're above*/
gen fpr_sim_weight = (fpr_score>=0.4 & svfr_ratio>=.75)*(firstgen + round(total_bad*share_good))

/*For some county-days, the good stations that report inspections have a low enough market share that the counts round down, leaving some 
re-inspections unassigned.  Manually add these, starting with the highest market share stations*/

/*First calculate the total number of re-inspections reassigned each county-day*/
bysort county date: egen extra_good = total((fpr_sim_weight-firstgen)*(bad_station==0))

/*And the number of good stations they are being assigned to*/
by county date: egen good_stations = total((bad_station == 0))

/*This gives the extra to be assigned*/
gen unassigned = total_bad - extra_good

/*If there are more extra than good stations (it happens), start by adding 1 to all good stations.  Do this several times, since there's
a small handful that have many more bad stations than good*/

forvalues i = 1/5{
	replace fpr_sim_weight = fpr_sim_weight + 1 if (bad_station == 0) & unassigned > good_stations
	replace unassigned = unassigned - good_stations if unassigned > good_stations
}

/*Now sort by market share and apply the remainder*/

gsort county date bad_station - share_good

by county date bad_station: replace fpr_sim_weight = fpr_sim_weight + 1 if bad_station == 0 & _n<= unassigned

gcollapse fpr_real_sim = fpr_current  [fw=fpr_sim_weight], by(date county) fast unsorted

joinby date county using `firstgen', unm(both)

capture drop _merge

joinby date county using `othergen', unm(both)

capture drop _merge

joinby date county using `re_pass', unm(both)

capture drop _merge

joinby date county using `fpr_sim', unm(both)

capture drop _merge

joinby date county using `totals', unm(both)

capture drop _merge

/*Now we have the number of vehicles inspected on each day in each county (and similar counts for sub-categories).
Next, we'll convert this to a running total of the number of vehicles inspected within the last 90 days*/

tsset county date

tsfill 

/*Calculate 30, 60, 90 and 120-day rolling sums*/


foreach i in 15 30 60 90 120{  
/*First get the average quality scores */
	foreach var of varlist  fpr_current*  *sim*{
		capture confirm var `var'_`i'
		if regexm("`var'","_[1]?[0-9][05]$") | _rc==0 continue
		qui gen `var'_`i' = `var'
		qui replace `var'_`i' = 0 if `var' == .
		qui tssmooth ma `var'_`i' = `var'_`i'*`++i', window(`--i' 1) replace
		qui tssmooth ma `var'_nonmiss = (missing(`var')==0)*`++i', window(`--i' 1)
		qui replace `var'_`i' = `var'_`i'/`var'_nonmiss
		drop `var'_nonmiss
	}
	/*Now get the running total of all the other variables*/
	foreach var of varlist N  /*directed asm tenplus*/ firstgen *nonemis *_emis *gross secondgen thirdgen othergen *pass old_init new_init initial_inspection fail_* *bin*{
		capture confirm var `var'_`i'
		if regexm("`var'","_[1]?[0-9][05]$") | _rc==0 continue
		qui gen `var'_`i' = `var'
		qui replace `var'_`i' = 0 if `var' ==.
		qui tssmooth ma `var'_`i' = `var'_`i'*`++i', window(`--i' 1) replace
	}
}
	

compress 
save star_counties, replace

log close
